Dans ce fichier, j’utilise les gènes que l’on a déterminé après l’étude des données RNAseq comme étant Activé en présence de SvbACT et les gènes considérés comme étant réprimés en présence de SvbREP.
Toutes les figures et tableaux sont disponibles ici (https://mycore.core-cloud.net/index.php/s/dBITIcJgQx6P7Fp) ou ici (https://www.dropbox.com/sh/irhqsmaemsicwqn/AAA34FiarJPLLjoOZ3wQP1w6a?dl=0) pour le script et le RData ( pasassez de place sur la dropbox et MyCore ne fonctionne plus )
#chargement des données RNAseq
load("edgeR_Result.RData")
#chargement des packages
library(pheatmap)
library(chipenrich)
library(RColorBrewer)
library(vioplot)
library(VennDiagram)
library(ChIPpeakAnno)
library(TxDb.Dmelanogaster.UCSC.dm6.ensGene)
library(DT)
library(plotly)
'%!in%' <- function(x,y)!('%in%'(x,y))
downREP_FP = rownames(lrt.2.tables$RepVsContr)[lrt.2.tables$RepVsAct$logFC < 0 & lrt.2.tables$ActVsRep$BH.bin == 1]
upACT_FP = rownames(lrt.2.tables$ActVsContr)[lrt.2.tables$ActVsContr$logFC > 0 & lrt.2.tables$ActVsRep$BH.bin == 1]
pvalue_Act_REP = rownames(lrt.2.tables$ActVsRep)[lrt.2.tables$ActVsRep$BH.bin == 1]
updown_Svb = row.names(lrt.2.tables$ActVsRep)[lrt.2.tables$ActVsContr$logFC > 0 & lrt.2.tables$RepVsContr$logFC < 0 & lrt.2.tables$ActVsRep$BH.bin == 1]
gene_cible_updown = upACT_FP%in%downREP_FP
downREP_FP_tab = lrt.2.tables$RepVsContr[lrt.2.tables$RepVsAct$logFC < 0,]
updown_Svb_tab= as.data.frame( round( cbind(lrt.2.tables$ActVsRep$logFC[rownames(lrt.2.tables$ActVsRep) %in% updown_Svb], lrt.2.tables$ActVsContr$logFC[rownames(lrt.2.tables$ActVsContr) %in% updown_Svb],lrt.2.tables$RepVsContr$logFC[rownames(lrt.2.tables$RepVsContr)%in%updown_Svb]) , 2) , row.names = updown_Svb )
colnames(updown_Svb_tab) = c("ActvsRep","ActvsCtrl","RepvsCtrl")
updown_Svb_tab_order = updown_Svb_tab[,2:3][order(updown_Svb_tab$ActvsRep , decreasing = T),]
datatable(updown_Svb_tab_order)
# Création des heatmap {.tabset .tabset-fade .tabset-pills}
#Ici je crée des heatmaps permettant de voir les gènes ***cibles*** et leur logFC dans les différentes conditions *Act/Ctrl*; *Rep/Ctrl* et *Act/Rep*.
# breaksList = seq(-7, 7, by = 1)
# pheatmap(updown_Svb_tab_order, main="Heatmap", color = colorRampPalette(rev(brewer.pal(n = 7, name = "PiYG")))(length(breaksList)), breaks = breaksList ,cluster_rows = F, cluster_cols = F, cellwidth = 30,border_color = "black", cellheight = 15,display_numbers = T, number_color = "black", filename = "/Users/alexmanchenoferris/Documents/Chipseq_ActRep/heatmap/heatmap_updown.pdf" )
#
# pheatmap(updown_Svb_tab_order, main="Heatmap", color = colorRampPalette(rev(brewer.pal(n = 7, name = "PiYG")))(length(breaksList)), breaks = breaksList ,cluster_rows = F, cluster_cols = F, cellwidth = 30,border_color = "black", cellheight = 15,display_numbers = T, number_color = "black", silent = F )
#
# pheatmap(updown_Svb_tab_order[1:100,], main="Heatmap", color = colorRampPalette(rev(brewer.pal(n = 7, name = "PiYG")))(length(breaksList)),cluster_rows = F, cluster_cols = F, cellwidth = 30,border_color = "black", cellheight = 15,display_numbers = T, number_color = "black", filename = "/Users/alexmanchenoferris/Documents/Chipseq_ActRep/heatmap/heatmap_updown_100premiers.pdf" )
#
# pheatmap(updown_Svb_tab_order[1:57,], main="Heatmap", color = colorRampPalette(rev(brewer.pal(n = 7, name = "PiYG")))(length(breaksList)), breaks = breaksList ,cluster_rows = F, cluster_cols = F, cellwidth = 30,border_color = "black", cellheight = 15,display_numbers = T, number_color = "black", filename = "/Users/alexmanchenoferris/Documents/Chipseq_ActRep/heatmap/heatmap_updown_57premiers.pdf" )
#
# pheatmap(updown_Svb_tab_order[58:114,], main="Heatmap", color = colorRampPalette(rev(brewer.pal(n = 7, name = "PiYG")))(length(breaksList)), breaks = breaksList ,cluster_rows = F, cluster_cols = F, cellwidth = 30,border_color = "black", cellheight = 15,display_numbers = T, number_color = "black", filename = "/Users/alexmanchenoferris/Documents/Chipseq_ActRep/heatmap/heatmap_updown_57seconds.pdf" )
#
# pheatmap(updown_Svb_tab_order[115:171,], main="Heatmap", color = colorRampPalette(rev(brewer.pal(n = 7, name = "PiYG")))(length(breaksList)), breaks = breaksList ,cluster_rows = F, cluster_cols = F, cellwidth = 30,border_color = "black", cellheight = 15,display_numbers = T, number_color = "black", filename = "/Users/alexmanchenoferris/Documents/Chipseq_ActRep/heatmap/heatmap_updown_57_3.pdf" )
#
# pheatmap(updown_Svb_tab_order[172:228,], main="Heatmap", color = colorRampPalette(rev(brewer.pal(n = 7, name = "PiYG")))(length(breaksList)), breaks = breaksList ,cluster_rows = F, cluster_cols = F, cellwidth = 30,border_color = "black", cellheight = 15,display_numbers = T, number_color = "black", filename = "/Users/alexmanchenoferris/Documents/Chipseq_ActRep/heatmap/heatmap_updown_57_4.pdf" )
#
# pheatmap(updown_Svb_tab_order[229:285,], main="Heatmap", color = colorRampPalette(rev(brewer.pal(n = 7, name = "PiYG")))(length(breaksList)), breaks = breaksList ,cluster_rows = F, cluster_cols = F, cellwidth = 30,border_color = "black", cellheight = 15,display_numbers = T, number_color = "black", filename = "/Users/alexmanchenoferris/Documents/Chipseq_ActRep/heatmap/heatmap_updown_57_5.pdf" )
#
#
# #automatisation
# gene_num = seq(1,nrow(updown_Svb_tab_order),15)
# pas = seq(1,length(gene_num)-1,1)
#
# for (i in pas){
# print(i)
# d = paste('Heatmap', gene_num[i] , gene_num[i+1]-1, sep = "-")
# name_file = paste(d,"pdf" , sep = ".")
#
# pheatmap(updown_Svb_tab_order[gene_num[i]:(gene_num[i+1]-1), ],main=d, color = colorRampPalette(rev(brewer.pal(n = 7, name = "PiYG")))(length(breaksList)), breaks = breaksList ,cluster_rows = F, cluster_cols = F, cellwidth = 30,border_color = "black", cellheight = 15,display_numbers = T, number_color = "black", filename = name_file, fontsize = 7)
# if (i == length(pas)){
# d = paste('/Users/alexmanchenoferris/Documents/Chipseq_ActRep/heatmap/Heatmap_auto', gene_num[i+1] , nrow(updown_Svb_tab_order), sep = "-")
# name_file = paste(d,"pdf" , sep = ".")
# pheatmap(updown_Svb_tab_order[gene_num[i+1]:nrow(updown_Svb_tab_order),],main=d, color = colorRampPalette(rev(brewer.pal(n = 7, name = "PiYG")))(length(breaksList)), breaks = breaksList ,cluster_rows = F, cluster_cols = F, cellwidth = 30,border_color = "black", cellheight = 15,display_numbers = T, number_color = "black", filename = name_file, fontsize = 7)
# }
#
# }
On obtient un groupe de gènes cibles de 285 gènes qui sont activés et réprimés par Svb tout en ayant une différence de comportement significatives quand on compare les conditions Activateur vs Répresseur.
On veut maintenant attribuer à ces gènes cibles les pics de fixation de Shavenbaby. Pour cela j’utilise deux approches une première que l’on pourrait qualifier de Peak centric, puis une seconde que l’on peut dire Gene centric.
Ce que je fais ici, c’est prendre les régions des fixations de Svb ( analyse des données de ChIPseq ) et les associer à tous les gènes de la drosophile. Pas vraiment ce que l’on cherche ici!
#Récupération des pics de Svb
load("/Users/alexmanchenoferris/Documents/Chipseq_ActRep/Myworkspace.RData")
B1_peaks = read.table("/Users/alexmanchenoferris/Documents/Chipseq_ActRep/macs/stringent/macs2_B1_peaks.narrowPeak_chr.bed",sep = "\t", comment.char = "#", header = F, stringsAsFactors = F)
Svb_peaks = B1_peaks[,1:3]
colnames(Svb_peaks) = c("chrom","start","end")
#histogramme de la distance des peaks
plot_dist_to_tss(peaks = Svb_peaks, genome = "dm6")
library(clusterProfiler)
library(org.Dm.eg.db)
et <- bitr(updown_Svb, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Dm.eg.db")
head(et)
## SYMBOL ENTREZID
## 1 Acer 34189
## 2 Acox57D-d 37446
## 3 Acox57D-p 37445
## 4 Adk1 39396
## 5 AhcyL1 38342
## 6 alpha-Est7 40901
manquant = read.table("/Users/alexmanchenoferris/Documents/Chipseq_ActRep/transform_geneID.txt")
colnames(manquant) = c("ID_htseq", "FlybaseID", "Flybase_ID", "gene_symbol_Flybase")
et_manquant = bitr(manquant$gene_symbol_Flybase, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Dm.eg.db")
gene_set = rbind(et, et_manquant)
all_ENTREZID = bitr( rownames(lrt.2.tables$ActVsContr),fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Dm.eg.db")
'%!in%' <- function(x,y)!('%in%'(x,y))
all_manquant = rownames(lrt.2.tables$ActVsContr)[rownames(lrt.2.tables$ActVsContr)%!in%all_ENTREZID$SYMBOL]
peak_assign_peak_centric = chipenrich(peaks = Svb_peaks, genome = 'dm6', genesets = "reactome",
locusdef = "1kb", qc_plots = FALSE, out_name = NULL, n_cores = 1)
## Reading peaks from data.frame...
## Assigning peaks to genes with assign_peaks(...) ..
## Test: ChIP-Enrich
## Genesets: Reactome
## Running tests..
datatable(peak_assign_peak_centric$peaks, caption = "Peaks assignement to genes Reactomes")
datatable(peak_assign_peak_centric$peaks_per_gene,rownames = F,caption = "Count fo genes per peaks ", filter = "top")
datatable(peak_assign_peak_centric$results[,1:10],rownames = F,caption = "Gene set enrichment test on each geneset ")
peak_assign_peak_centric = chipenrich(peaks = Svb_peaks, genome = 'dm6', genesets = "GOMF",
locusdef = "1kb", qc_plots = FALSE, out_name = NULL, n_cores = 1)
## Reading peaks from data.frame...
## Assigning peaks to genes with assign_peaks(...) ..
## Test: ChIP-Enrich
## Genesets: Gene Ontology Molecular Function
## Running tests..
datatable(peak_assign_peak_centric$peaks, caption = "Peaks assignement to genes GOMF")
datatable(peak_assign_peak_centric$peaks_per_gene,rownames = F,caption = "Count fo genes per peaks ", filter = "top")
datatable(peak_assign_peak_centric$results[,1:10],rownames = F,caption = "Gene set enrichment test on each geneset ")
peak_assign_peak_centric = chipenrich(peaks = Svb_peaks, genome = 'dm6', genesets = "GOCC",
locusdef = "1kb", qc_plots = FALSE, out_name = NULL, n_cores = 1)
## Reading peaks from data.frame...
## Assigning peaks to genes with assign_peaks(...) ..
## Test: ChIP-Enrich
## Genesets: Gene Ontology Cellular Component
## Running tests..
datatable(peak_assign_peak_centric$peaks, caption = "Peaks assignement to genes GOCC")
datatable(peak_assign_peak_centric$peaks_per_gene,rownames = F,caption = "Count fo genes per peaks ", filter = "top")
datatable(peak_assign_peak_centric$results[,1:10],rownames = F,caption = "Gene set enrichment test on each geneset ")
peak_assign_peak_centric = chipenrich(peaks = Svb_peaks, genome = 'dm6', genesets = "GOBP",
locusdef = "1kb", qc_plots = FALSE, out_name = NULL, n_cores = 1)
## Reading peaks from data.frame...
## Assigning peaks to genes with assign_peaks(...) ..
## Test: ChIP-Enrich
## Genesets: Gene Ontology Biological Process
## Running tests..
datatable(peak_assign_peak_centric$peaks, caption = "Peaks assignement to genes GOCC")
datatable(peak_assign_peak_centric$peaks_per_gene,rownames = F,caption = "Count fo genes per peaks ", filter = "top")
datatable(peak_assign_peak_centric$results[,1:10],rownames = F,caption = "Gene set enrichment test on each geneset ")
J’ai pris les pics que j’avais associé à des gènes " le plus proche " avec le package ChIPpeakAnno et je regarde combien de gènes sont commun avec les gènes cibles.
binding_gene = read.table("/Users/alexmanchenoferris/Documents/RNAseq/RNAseq_S2/gene_binding_svb.txt")
venn.diagram(
x = list( updownSvb = updown_Svb,
binding_gene = binding_gene$x),
main = "Comparaison Genes Cibles et binding ",
filename = "venndia_updown_ACTREPSign_bind.png" , col = c("purple","blue"),height = 5000,width = 5000, imagetype = "png")
## [1] 1
overlap_commun = calculate.overlap(x = list("updown" = updown_Svb , "binding" = binding_gene$x))
#calcul de la pval
phyper(length(overlap_commun$a3),length(overlap_commun$a1),nrow(RNAseq)-length(overlap_commun$a1),length(overlap_commun$a2), lower.tail = FALSE)
## [1] 4.23056e-22
bind_updown = updown_Svb_tab[rownames(updown_Svb_tab)%in%binding_gene$x,]
bind_updown$gene =rownames(bind_updown)
venn.diagram(
x = list( upAct = row.names(lrt.2.tables$ActVsContr)[lrt.2.tables$ActVsContr$logFC > 0 & lrt.2.tables$ActVsContr$BH.bin ==1 ],
downAct = row.names(lrt.2.tables$ActVsContr)[lrt.2.tables$ActVsContr$logFC < 0 & lrt.2.tables$ActVsContr$BH.bin ==1],
binding_gene = binding_gene$x),
main = "Comparaison Genes DE Act et binding ",
filename = "/Users/alexmanchenoferris/Documents/RNAseq/venndiagramm_geneciblesDirecte/venndia_DEAct_bind.png" , col = c("green","darkgreen","blue"),height = 5000,width = 5000, imagetype = "png")
## [1] 1
venn.diagram(
x = list( upAct = row.names(lrt.2.tables$ActVsContr)[lrt.2.tables$ActVsContr$logFC > 0 ],
downRep = row.names(lrt.2.tables$ActVsContr)[lrt.2.tables$ActVsContr$logFC < 0 ],
signActRep = row.names(lrt.2.tables$ActVsRep)[lrt.2.tables$ActVsRep$BH.bin == 1],
binding_gene = binding_gene$x),
main = "Comparaison upAct, downRep,signActRep, binding ",
filename = "venndia_tout_bind.png" , col = c("green","red","blue", "purple"),height = 5000,width = 5000,imagetype = "png")
## [1] 1
venn.diagram(
x = list( downAct = row.names(lrt.2.tables$ActVsContr)[lrt.2.tables$ActVsContr$logFC < 0],
binding_gene = binding_gene$x),
main = "Comparaison Genes downACT et binding ",
filename = "/Users/alexmanchenoferris/Documents/RNAseq/venndiagramm_geneciblesDirecte/venndia_downAct_bind.png" , col = c("green","blue"),height = 5000,width = 5000, imagetype = "png")
## [1] 1
#calcul de la pval
overlap_commun = calculate.overlap(x = list("downAct" = row.names(lrt.2.tables$ActVsContr)[lrt.2.tables$ActVsContr$logFC < 0] , "binding" = binding_gene$x))
phyper(length(overlap_commun$a3),length(overlap_commun$a1),nrow(RNAseq)-length(overlap_commun$a1),length(overlap_commun$a2), lower.tail = FALSE)
## [1] 0.0005195547
venn.diagram(
x = list( downRep = row.names(lrt.2.tables$RepVsContr)[lrt.2.tables$RepVsContr$logFC < 0],
binding_gene = binding_gene$x),
main = "Comparaison Genes downRep et binding ",
filename = "/Users/alexmanchenoferris/Documents/RNAseq/venndiagramm_geneciblesDirecte/venndia_downRep_bind.png" , col = c("red","blue"),height = 5000,width = 5000,imagetype = "png")
## [1] 1
overlap_commun = calculate.overlap(x = list("downRep" = row.names(lrt.2.tables$RepVsContr)[lrt.2.tables$RepVsContr$logFC < 0] , "binding" = binding_gene$x))
phyper(length(overlap_commun$a3),length(overlap_commun$a1),nrow(RNAseq)-length(overlap_commun$a1),length(overlap_commun$a2), lower.tail = FALSE)
## [1] 8.467327e-10
venn.diagram(
x = list( upRep = row.names(lrt.2.tables$RepVsContr)[lrt.2.tables$RepVsContr$logFC > 0],
downRep = row.names(lrt.2.tables$RepVsContr)[lrt.2.tables$RepVsContr$logFC < 0],
binding_gene = binding_gene$x),
main = "Comparaison Genes DERep et binding ",
filename = "../venndiagramm_geneciblesDirecte/venndia_DE_Rep_bind.png" , col = c("red","darkred","blue"),height = 5000,width = 5000,imagetype = "png")
## [1] 1
venn.diagram(
x = list( upRep = row.names(lrt.2.tables$RepVsContr)[lrt.2.tables$RepVsContr$logFC > 0 & lrt.2.tables$RepVsContr$BH.bin == 1],
binding_gene = binding_gene$x),
main = "Comparaison Genes upRep et binding ",
filename = "/Users/alexmanchenoferris/Documents/RNAseq/venndiagramm_geneciblesDirecte/venndia_upRep_bind.png" , col = c("red","blue"),height = 5000,width = 5000,imagetype = "png")
## [1] 1
overlap_commun = calculate.overlap(x = list("upRep" = row.names(lrt.2.tables$RepVsContr)[lrt.2.tables$RepVsContr$logFC > 0] , "binding" = binding_gene$x))
phyper(length(overlap_commun$a3),length(overlap_commun$a1),nrow(RNAseq)-length(overlap_commun$a1),length(overlap_commun$a2), lower.tail = FALSE)
## [1] 1.23871e-27
venn.diagram(
x = list( upAct = row.names(lrt.2.tables$ActVsContr)[lrt.2.tables$ActVsContr$logFC > 0 & lrt.2.tables$ActVsContr$BH.bin == 1],
downRep = row.names(lrt.2.tables$RepVsContr)[lrt.2.tables$RepVsContr$logFC < 0 & lrt.2.tables$RepVsContr$BH.bin == 1 ],
binding_gene = binding_gene$x),
main = "Comparaison Genes upAct , downRep et binding ",
filename = "/Users/alexmanchenoferris/Documents/RNAseq/venndiagramm_geneciblesDirecte/venndia_upAct_downRep_binding.png" ,height = 5000,width = 5000, imagetype = "png")
## [1] 1
venn.diagram(
x = list( upAct = row.names(lrt.2.tables$ActVsContr)[lrt.2.tables$ActVsContr$logFC > 0],
signActRep = rownames(lrt.2.tables$ActVsRep)[lrt.2.tables$ActVsRep$BH.bin == 1],
downRep = row.names(lrt.2.tables$RepVsContr)[lrt.2.tables$RepVsContr$logFC < 0],
binding_gene = binding_gene$x),
main = "Comparaison Genes Cibles et binding ",
filename = "/Users/alexmanchenoferris/Documents/RNAseq/venndiagramm_geneciblesDirecte/venndia_tout.png" ,height = 5000,width = 5000,imagetype = "png")
## [1] 1
J’obtiens avec cette première méthode 172 gènes cibles présentant une fixation de Svb. On peut dire que ce sont des gènes cibles directes (ils sont avec leur LogFC dans le fichier gene_centric_methode1_geneciblesdirecte.txt ).
Dans cette approche Gene centrique, ce que j’ai fais c’est de calculer la distances entre les coordonnées des gènes cibles et les coordonnées des pics de fixation de Shavenbaby.
Ici ce que je fais c’est que je prend les gènes cibles, je prend leur coordonnées et pas que leur nom contrairement à la Pics Centrique Méthode 2 puis je compare ces gènes avec les picss qui ont aussi leurs coordonnées
target_set = Dm6_gene[names(Dm6_gene)%in%updown_Svb,]
datatable(as.data.frame(target_set))
table(countOverlaps(B1.peaks,target_set))
##
## 0 1 2
## 5172 326 14
count_overlaps = as.data.frame(table(countOverlaps(B1.peaks,target_set)))
barplot(count_overlaps$Freq, main = " Nombre de gènes dans pics", col = c('#67001f','#b2182b','#d6604d'), xlab = "gènes", ylab = "pics", names.arg = count_overlaps$Var1)
overlaps_ChIPseq_RNA = as.data.frame(findOverlaps(B1.peaks, target_set,type = "any", select = "all"))
target_set_tab = as.data.frame(target_set)
target_set_tab$row = row(target_set_tab)[,1]
target_set_tab$gene = names(target_set)
table(countOverlaps(target_set,B1.peaks))
##
## 0 1 2 3 4 5 6 7 9 10 17
## 107 98 40 21 7 6 2 1 1 1 1
count_overlaps = as.data.frame(table(countOverlaps(target_set,B1.peaks)))
barplot(count_overlaps$Freq,legend.text = count_overlaps$Var1, col = c('#67001f','#b2182b','#d6604d','#f4a582','#fddbc7','#f7f7f7','#d1e5f0','#92c5de','#4393c3','#2166ac','#053061'), main = " Nombre de pics compris ans les gènes cibles" , names.arg = count_overlaps$Var1,ylab = "# genes", xlab = "# peaks")
overlaps_RNA_ChIPseq = as.data.frame(findOverlaps(target_set,B1.peaks,type = "any", select = "all"))
target_set_tab = as.data.frame(target_set)
target_set_tab$row = row(target_set_tab)[,1]
target_set_tab$gene = names(target_set)
overlaps_RNA_ChIPseq$gene = target_set_tab$gene[match(overlaps_RNA_ChIPseq$queryHits, target_set_tab$row)]
gene_overlapant_RNA_ChIPseq = unique(overlaps_RNA_ChIPseq$gene)
Les gènes CG4297, CG3036, senju, stai, mmy, Pvf3, CG7149, LKRSDH, pes, CG7627, grk, Acer, CG17834, Sema1a, Spn31A, CG7384, piwi, Samuel, dmGlut, CG6770, vir-1, CG15279, lace, c(2)M, Socs36E, ham, CG9331, CG42238, CG2201, CR43148, lt, ap, CG43340, CG1882, pdm3, CG12780, CG8213, hig, Cyp4p3, wun, brp, CG12926, Mos, CR45140, CG6553, CR44367, Cyp6a22, Cyp6a20, CG30091, Cdk4, CR44375, mbl, CG15111, cora, CG11961, sm, exu, Lapsyn, CG9485, Acox57D-p, CG34370, CG2852, CG3831, CG9815, ItgaPS5, Orcokinin, CR45805, CR43334, bab2, rho, pns, CG5707, CG45186, Src64B, axo, CG18769, CR32385, CG8620, CG8596, pst, Fhos, bol, path, CG6084, Adk1, mirr, nuf, fz, bbg, Gbs-70E, sff, CG7341, CG9629, CG13252, Eip78C, CG7470, CR45174, CG11739, CG34357, Cdep, Cerk, alpha-Est7, CR45549, CR18228, CG8112, CG18473, RhoL, hth, CG14688, CG5281, ClC-a, GC1, GstD9, GstD1, GstD3, NANS, sqd, CG34383, CG14855, RpS5b, CR45600, CG3984, alpha-Man-IIb, spn-E, glob1, CG6006, CG6126, Fas1, Mvl, dnd, wake, Lsd-1, CG33111, Pli, CG6178, crb, mld, CG10550, CG14545, CG31324, CG12290, Tusp, CG12428, CG11897, Zip99C, CG9743, Fer1HCH, CG1544, dati, zfh2, apolpp, Cadps, CG3777, CG32816, CG32795, CR44498, CG14438, CG9650, CR44357, CG42797, Gip, l(1)G0289, Karl, CG43921, Pde9, rdgB, inaE, sog, Rhp, dpr18, Grip84, Hers, amn, CG32521, l(1)G0196, CR43493, CR44997, fog possédant des pics.
dist_peak_gene = distanceToNearest(B1.peaks,target_set,ignore.strand = T,select = "all")
dist_tab_peak_gene = as.data.frame(dist_peak_gene)
dist_tab_peak_gene$gene = target_set_tab$gene[match(dist_tab_peak_gene$subjectHits, target_set_tab$row)]
overlaps_ChIPseq_RNA$distance = dist_tab_peak_gene$distance[match(overlaps_ChIPseq_RNA$queryHits, dist_tab_peak_gene$queryHits)]
overlaps_ChIPseq_RNA$gene = target_set_tab$gene[match(overlaps_ChIPseq_RNA$subjectHits, target_set_tab$row)]
colnames(dist_tab_peak_gene) = c("peaks_id","target_id","distance","gene")
dist_tab_peak_gene_10kb =dist_tab_peak_gene[dist_tab_peak_gene$distance <= 10000,]
dist_tab_peak_gene_5kb = dist_tab_peak_gene[dist_tab_peak_gene$distance <=5000,]
dist = distanceToNearest(target_set,B1.peaks,ignore.strand = T,select = "all")
dist_tab = as.data.frame(dist)
dist_tab$gene = target_set_tab$gene[match(dist_tab$queryHits, target_set_tab$row)]
overlaps_RNA_ChIPseq$distance = dist_tab$dist[match(overlaps_RNA_ChIPseq$gene, dist_tab$gene)]
row = c("Minimun","1st Qu.","Median","Mean","3rd Qu.", "Max")
distance = c(0,0,0,4453,0,1049564)
summary = data.frame(distance,row.names = row)
datatable(summary)
On peut observer qu’en moyenne les pics présentent une distance à moins de 5kb des gènes.
round(c(100*98/178,100*40/178,100*21/178,100*7/178,6*100/178,100*2/178,1*100/178,100*1/178,100*1/178,100*1/178),2)
## [1] 55.06 22.47 11.80 3.93 3.37 1.12 0.56 0.56 0.56 0.56
resultat_overlap = data.frame(genes = c(98,40,21,7,6,2,1,1,1,1), peaks = c(1,2,3,4,5,6,7,9,10,17), pourcentage = c("55.06%","22.47%","11.80%","3.93%","3.37%","1.12%","0.56%","0.56%","0.56%","0.56%"))
barplot(resultat_overlap$genes, names.arg = resultat_overlap$peaks, xlab = "Number of peaks", ylab = "Number of genes", legend.text = resultat_overlap$pourcentage, col = c('#67001f','#b2182b','#d6604d','#f4a582','#fddbc7','#f7f7f7','#2166ac','#2166ac','#2166ac','#2166ac'))
dist_tab$factor = rep("gene", nrow(dist_tab))
fig = ggplot(dist_tab, aes(x= factor,y = distance, color = factor)) + geom_violin() +scale_fill_brewer(palette="Dark2") +ggtitle("Distribution de la distance des pics aux gènes")
ggplotly(fig)
vioplot(dist_tab_peak_gene_5kb$distance,col="blue", main = "Distribution de la distance des pics aux gènes", ylab = "distance (pb)")
dist_tab_peak_gene_5kb$factor = rep("gene", nrow(dist_tab_peak_gene_5kb))
fig = ggplot(dist_tab_peak_gene_5kb, aes(x= factor,y = distance, color = factor)) + geom_violin() +scale_fill_brewer(palette="Dark2") +ggtitle("Distribution de la distance des pics aux gènes (0-5kb)")
ggplotly(fig)
dist_tab_peak_gene_10kb$factor = rep("gene", nrow(dist_tab_peak_gene_10kb))
fig = ggplot(dist_tab_peak_gene_10kb, aes(x= factor,y = distance, color = factor)) + geom_violin() +scale_fill_brewer(palette="Dark2") +ggtitle("Distribution de la distance des pics aux gènes (0-10kb)")
ggplotly(fig)
t = dist_tab[dist_tab$distance==0 ,]
unique(t$gene)
## [1] "CG4297" "CG3036" "senju" "stai"
## [5] "mmy" "Pvf3" "CG7149" "LKRSDH"
## [9] "pes" "CG7627" "grk" "Acer"
## [13] "CG17834" "Sema1a" "Spn31A" "CG7384"
## [17] "piwi" "Samuel" "dmGlut" "CG6770"
## [21] "vir-1" "CG15279" "lace" "c(2)M"
## [25] "Socs36E" "ham" "CG9331" "CG42238"
## [29] "CG2201" "CR43148" "lt" "ap"
## [33] "CG43340" "CG1882" "pdm3" "CG12780"
## [37] "CG8213" "hig" "Cyp4p3" "wun"
## [41] "brp" "CG12926" "Mos" "CR45140"
## [45] "CG6553" "CR44367" "Cyp6a22" "Cyp6a20"
## [49] "CG30091" "Cdk4" "CR44375" "mbl"
## [53] "CG15111" "cora" "CG11961" "sm"
## [57] "exu" "Lapsyn" "CG9485" "Acox57D-p"
## [61] "CG34370" "CG2852" "CG3831" "CG9815"
## [65] "ItgaPS5" "Orcokinin" "CR45805" "CR43334"
## [69] "bab2" "rho" "pns" "CG5707"
## [73] "CG45186" "Src64B" "axo" "CG18769"
## [77] "CR32385" "CG8620" "CG8596" "pst"
## [81] "Fhos" "bol" "path" "CG6084"
## [85] "Adk1" "mirr" "nuf" "fz"
## [89] "bbg" "Gbs-70E" "sff" "CG7341"
## [93] "CG9629" "CG13252" "Eip78C" "CG7470"
## [97] "CR45174" "CG11739" "CG34357" "Cdep"
## [101] "Cerk" "alpha-Est7" "CR45549" "CR18228"
## [105] "CG8112" "CG18473" "RhoL" "hth"
## [109] "CG14688" "CG5281" "ClC-a" "GC1"
## [113] "GstD9" "GstD1" "GstD3" "NANS"
## [117] "sqd" "CG34383" "CG14855" "RpS5b"
## [121] "CR45600" "CG3984" "alpha-Man-IIb" "spn-E"
## [125] "glob1" "CG6006" "CG6126" "Fas1"
## [129] "Mvl" "dnd" "wake" "Lsd-1"
## [133] "CG33111" "Pli" "CG6178" "crb"
## [137] "mld" "CG10550" "CG14545" "CG31324"
## [141] "CG12290" "Tusp" "CG12428" "CG11897"
## [145] "Zip99C" "CG9743" "Fer1HCH" "CG1544"
## [149] "dati" "zfh2" "apolpp" "Cadps"
## [153] "CG3777" "CG32816" "CG32795" "CR44498"
## [157] "CG14438" "CG9650" "CR44357" "CG42797"
## [161] "Gip" "l(1)G0289" "Karl" "CG43921"
## [165] "Pde9" "rdgB" "inaE" "sog"
## [169] "Rhp" "dpr18" "Grip84" "Hers"
## [173] "amn" "CG32521" "l(1)G0196" "CR43493"
## [177] "CR44997" "fog"
nrow(dist_tab[dist_tab$distance>0 & dist_tab$distance<=1000,])
## [1] 30
nrow(dist_tab[dist_tab$distance> 1000 & dist_tab$distance<=3000,])
## [1] 29
nrow(dist_tab[dist_tab$distance> 3000 & dist_tab$distance<=5000,])
## [1] 10
nrow(dist_tab[dist_tab$distance> 5000 & dist_tab$distance<=10000,])
## [1] 16
nrow(dist_tab[dist_tab$distance> 10000 ,])
## [1] 22
pielabels = c("0kb","0-1kb","1kb-3kb","3kb-5kb","5kb-10kb",">10kb")
resultat = data.frame(distance = c("0kb","0-1kb","1kb-3kb","3kb-5kb","5kb-10kb",">10kb"), genes = c(178,30,29,10,16,22), pourcentage = c("62.5%","10.5%","10.2%","3.5%","5.6%","7.7%"))
fig <- plot_ly(resultat, labels = ~distance, values = ~genes, type = 'pie')
fig <- fig %>% layout(title = 'Distance des gènes cibles aux pics ')
pie(resultat$genes, labels = resultat$pourcentage, col = c("#ffffff","#e9a3c9","#c51b7d","#e6f5d0","#a1d76a","#4d9221"), main = "Distance between Svb peaks and its target genes" )
legend("bottomright",legend=pielabels,fill= c("#ffffff","#e9a3c9","#c51b7d","#e6f5d0","#a1d76a","#4d9221"))
La colonnes chr, start , end correspondent aux coordonnées des pics dans les tableaux dist_tab_etc
B1_peaks_tab = as.data.frame(B1.peaks)
B1_peaks_tab$names = rownames(B1_peaks_tab)
B1_peaks_tab$row = row(B1_peaks_tab)[,1]
dist_tab$chr = B1_peaks_tab$seqnames[match( dist_tab$subjectHits, B1_peaks_tab$row )]
dist_tab$start = B1_peaks_tab$start[match( dist_tab$subjectHits, B1_peaks_tab$row )]
dist_tab$end = B1_peaks_tab$end[match( dist_tab$subjectHits, B1_peaks_tab$row )]
dist_tab$names = B1_peaks_tab$names[match( dist_tab$subjectHits, B1_peaks_tab$row )]
dist_tab$scores = B1_peaks_tab$score[match( dist_tab$subjectHits, B1_peaks_tab$row )]
dist_tab$fold = B1_peaks_tab$fold_enrichment[match( dist_tab$subjectHits, B1_peaks_tab$row )]
datatable(dist_tab[,3:10], filter = "top", caption = "Distance entre les gènes cibles et les pics de Shavenbaby")
dist_tab_gene_peak_10kb = dist_tab[dist_tab$distance<=10000,]
datatable(dist_tab_gene_peak_10kb[,3:10], filter = "top", caption = "Les gènes cibles et les pics de Shavenbaby dont la distance est <= 10kb")
dist_tab_gene_peak_5kb = dist_tab[dist_tab$distance<=5000,]
datatable(dist_tab_gene_peak_5kb[,3:10], filter = "top", caption = "Les gènes cibles et les pics de Shavenbaby dont la distance est <= 5kb")
dist_tab_peak_gene_5kb_10kb = dist_tab[dist_tab$distance>5000 & dist_tab$distance<=10000,]
datatable(dist_tab_peak_gene_5kb_10kb[,3:10], filter = "top", caption = "Les gènes cibles et les pics de Shavenbaby dont la distance est comprise entre 5 et 10kb")
dist_tab_peak_gene_audela10kb = dist_tab[dist_tab$distance> 10000 ,]
datatable(dist_tab_peak_gene_audela10kb[,3:10], filter = "top", caption = "Les gènes cibles et les pics de Shavenbaby dont la distance est supérieure à 10kb")
peak_overlaping_gene = dist_tab[dist_tab$distance==0,]
peak_overlaping_gene_GR = GRanges(seqnames = Rle(peak_overlaping_gene$chr),
ranges=IRanges(start = peak_overlaping_gene$start,end = peak_overlaping_gene$end, names = peak_overlaping_gene$names),
strand = Rle(rep("*",nrow(peak_overlaping_gene))),
score = peak_overlaping_gene$scores,
fold_enrichment = peak_overlaping_gene$fold)
datatable(peak_overlaping_gene[,3:10], filter = "top", caption = "Les gènes cibles et les pics de Shavenbaby chevauchant ")
non_associed_peaks= B1_peaks_tab[B1_peaks_tab$names%!in%dist_tab$names,]
gene_0_5kb = target_set_tab[,1:5][unique(dist_tab_gene_peak_5kb$gene),]
gene_0_5kb$gene = rownames(gene_0_5kb)
datatable(gene_0_5kb ,caption = "Genes cibles directes (0-5kb)")
gene_5_10kb = target_set_tab[,1:5][unique(dist_tab_peak_gene_5kb_10kb$gene),]
gene_5_10kb$gene = rownames(gene_5_10kb)
datatable(gene_5_10kb,caption = "Gènes cibles directes (5-10kb)")
gene_audela_10kb = target_set_tab[,1:5][unique(dist_tab_peak_gene_audela10kb$gene),]
gene_audela_10kb$gene = rownames(gene_audela_10kb)
datatable(gene_audela_10kb, caption = "Gènes cibles ( au-delà de 10kb")
Dm6_gene_tab = as.data.frame(Dm6_gene)
Dm6_gene_tab$gene = rownames(Dm6_gene_tab)
non_associed_gene_peaks = Dm6_gene_tab[,1:5][Dm6_gene_tab$gene%!in%dist_tab$gene,]
non_associed_gene_peaks$gene = rownames(non_associed_gene_peaks)
promoters_gene_0_5kb = promoters( GRanges(seqnames = gene_0_5kb$seqnames,
ranges = IRanges(start = gene_0_5kb$start, end = gene_0_5kb$end, names = gene_0_5kb$gene),
strand = gene_0_5kb$strand) )
promoters_gene_0_5kb_tab = as.data.frame(promoters_gene_0_5kb)
promoters_gene_5_10kb = promoters( GRanges(seqnames = gene_5_10kb$seqnames,
ranges = IRanges(start = gene_5_10kb$start, end = gene_5_10kb$end, names = gene_5_10kb$gene),
strand = gene_5_10kb$strand) )
promoters_gene_5_10kb_tab = as.data.frame(promoters_gene_5_10kb)
promoters_gene_audela_10kb = promoters( GRanges(seqnames = gene_audela_10kb$seqnames,
ranges = IRanges(start = gene_audela_10kb$start, end = gene_audela_10kb$end, names = gene_audela_10kb$gene),
strand = gene_audela_10kb$strand))
promoters_gene_audela_10kb_tab = as.data.frame(promoters_gene_audela_10kb)
promoters_non_associed_gene_peaks = promoters( GRanges(seqnames = non_associed_gene_peaks$seqnames,
ranges = IRanges(start = non_associed_gene_peaks$start, end = non_associed_gene_peaks$end, names = non_associed_gene_peaks$gene),
strand = non_associed_gene_peaks$strand))
promoters_gene_non_associed_gene_peaks_tab = as.data.frame(promoters_non_associed_gene_peaks)
promoters_gene_non_associed_gene_peaks_tab$gene = rownames(promoters_gene_non_associed_gene_peaks_tab)
flanked_gene_0_5kb = flank( GRanges(seqnames = gene_0_5kb$seqnames,
ranges = IRanges(start = gene_0_5kb$start, end = gene_0_5kb$end, names = gene_0_5kb$gene),
strand = gene_0_5kb$strand) , 5000,start=TRUE)
flanked_gene_0_5kb_tab = as.data.frame(flanked_gene_0_5kb)
flanked_gene_5_10kb = flank( GRanges(seqnames = gene_5_10kb$seqnames,
ranges = IRanges(start = gene_5_10kb$start, end = gene_5_10kb$end, names = gene_5_10kb$gene),
strand = gene_5_10kb$strand) , 5000 , start = T)
flanked_gene_5_10kb_tab = as.data.frame(flanked_gene_5_10kb)
flanked_gene_audela_10kb = flank( GRanges(seqnames = gene_audela_10kb$seqnames,
ranges = IRanges(start = gene_audela_10kb$start, end = gene_audela_10kb$end, names = gene_audela_10kb$gene),
strand = gene_audela_10kb$strand), 5000 , start = T )
flanked_gene_audela_10kb_tab = as.data.frame(flanked_gene_audela_10kb)
flanked_non_associed_gene_peaks = flank( GRanges(seqnames = non_associed_gene_peaks$seqnames,
ranges = IRanges(start = non_associed_gene_peaks$start, end = non_associed_gene_peaks$end, names = non_associed_gene_peaks$gene),
strand = non_associed_gene_peaks$strand), 5000, start = T )
flanked_gene_non_associed_gene_peaks_tab = as.data.frame(flanked_non_associed_gene_peaks)
flanked_gene_non_associed_gene_peaks_tab$gene = rownames(flanked_gene_non_associed_gene_peaks_tab)
Pour les gènes contenant des pics j’ai regardé sur IGV dans quel région génomique se trouvaient le pic.
result_overlappant_pic = data.frame(type = c("promotor","5'UTR","1st intron","other introns","other","exons"), nombre = c(71,124,50,55,39,15), pourcentage = c("20.06%","30.03%","14.12%","15.54%","11.02%","4.24%"))
pielabel_overlappant = result_overlappant_pic$pourcentage
pie(result_overlappant_pic$nombre, labels = result_overlappant_pic$type, col = c("#ffffff","#e9a3c9","#c51b7d","#e6f5d0","#a1d76a","#4d9221"), main = "Overlapping peaks genomic distribution" )
legend("bottomright",legend=pielabel_overlappant,fill= c("#ffffff","#e9a3c9","#c51b7d","#e6f5d0","#a1d76a","#4d9221"))
fig <- plot_ly(result_overlappant_pic, labels = ~type, values = ~nombre, type = 'pie')
fig <- fig %>% layout(title = 'Distribution génomique des pics')
fig
Pour faire autrement qu’à l’oeil j’utilise le package ChIPpeakAnno
if(require(TxDb.Dmelanogaster.UCSC.dm6.ensGene)){
TxDb <- TxDb.Dmelanogaster.UCSC.dm6.ensGene
exons <- exons(TxDb, columns=NULL)
fiveUTRs <- unique(unlist(fiveUTRsByTranscript(TxDb)))
#TSS = getAnnotation(TxDb)
overlap_region = assignChromosomeRegion(peak_overlaping_gene_GR, exons, fiveUTRs,TSS, nucleotideLevel=FALSE,
precedence=c("Promoters", "immediateDownstream",
"fiveUTRs", "threeUTRs",
"Exons", "Introns"),
TxDb=TxDb)
}
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 21 out-of-bound ranges located on sequences
## chr4, chrY_DS483742v1_random, chrY_DS483875v1_random,
## chrY_DS484142v1_random, chrY_DS484530v1_random, chrY_DS484909v1_random,
## chrY_DS485423v1_random, chrY_DS485523v1_random, chrY_DS485938v1_random,
## chrUn_CP007102v1, chrUn_DS483910v1, chrUn_DS484898v1, chrY, and
## chrUn_DS484581v1. Note that ranges located on a sequence whose length
## is unknown (NA) or on a circular sequence are not considered
## out-of-bound (use seqlengths() and isCircular() to get the lengths and
## circularity flags of the underlying sequences). You can use trim() to
## trim these ranges. See ?`trim,GenomicRanges-method` for more
## information.
## Warning in assignChromosomeRegion(peak_overlaping_gene_GR, exons, fiveUTRs, :
## peaks.RD has sequence levels not in TxDb.
res = as.data.frame(overlap_region$percentage)
res$cat = rownames(res)
fig <- plot_ly(res, labels = ~cat, values = ~overlap_region$percentage, type = 'pie')
fig <- fig %>% layout(title = 'Distribution génomique des pics')
fig
genes_manquant = target_set_tab$gene[unique(overlaps_RNA_ChIPseq$gene)%!in%bind_updown$gene]
Avec les deux méthodes gene centrique on trouve à peu près le même nombre de gènes chevauchant.Cependant quand on regarde quels sont les gènes qui diffèrent, mmy, mtsh, CG7627, piwi, Socs36E, CG9331, CG42238, CR43148, CG17018, CG43340, CG1882, PPO2, CG13743, wun, brp, CG12926, PCB, Prx2540-2, Mos, cbc, CG6357, CG6553, Hr51, CG43107, CR44386, mbl, GstE6, CG18190, CG15128, CG34370, CG2852, CG3831, rho, CG18769, pst, CG7194, CG6084, CG43085, CG9629, Eip78C, CR43426, Pzl, Cdep, Cerk, RhoL, hth, l(3)neo38, GstD9, GstD7, Cyp9f2, CG14855, CR43472, MtnE, MtnD, dnd, CR43475, CG17119, CG33111, CG31436, CG10550, CG12290, Tusp, CG12428, CG11897, Zip99C, Fer1HCH, dati, zfh2, apolpp, CG9650, CG32700, CG42797, CG45061, Gip, CR44962, sog, Grip84, car, Hers on voit qu’il y a pas mal de différence. De plus en regardant les gènes qui manquent dans la méthode 1 vis à vis de la méthode 2 on peut voir que certains gènes présentent des pics en leur sein.
Je privilégie la Méthode 2 de la méthode Gene centrique qui s’appuie sur les coordonnées des gènes et des pics en calculant la distance pour tous en ne donnant pas de poids contrairement à la Méthode 2 Pics centriques
En conclusion pour les 285 gènes cibles de Svb, le facteur de transcription se fixent pour 178 gènes sur le gène. En majorité dans la région soit du promoteur, soit dans la région 5’UTR. Pour les autres gènes, le facteur de transcription se fixent à moins de 5kb du gène régulé, ce qui correspond à ce que l’on connait sur la régulation des gènes cibles par un facteur de transcription chez la drosophile. Shavenbaby régule l’expression de 247 gènes cibles directes.